Simulation Analysis

Statistical Analysis
Author

ThanachotL.

Published

December 19, 2022

Introduction

Description and Simulation: In this worksheet, I will create an artificial data set, the age of a population, using a composition technique. In other words, I'll create six sample sets of the population with different mean ages to portray each age interval. After that, I will elaborate on the data with descriptive statistics, enabling us to understand the data from different perspectives.

Construct a Data Set

import pandas as pd
import numpy as np
import seaborn as sns
import random
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
#import chart_studio.plotly as py
"""np.random.normal is used to generate a vector of random values that follow a normal distribution 
with a specific mean and standard deviation: mean, sd, size """ 

#Note: Because the data are made up by sampling from normal distributions, some observations are negative,
#but an age can't be negative, so this needs to be handled later
# Seed BOTH generators: `random` drives the later random.choice / random.sample
# calls, while the draws below come from NumPy's own global generator, which
# random.seed() does not affect.
random.seed(112)
np.random.seed(112)
# One normal sample per age group: loc = mean age, scale = standard deviation,
# size = number of people in the group.
# NOTE(review): a and b use the same mean and sd although the introduction
# speaks of six different means - confirm this is intended.
a = np.random.normal(loc=5, scale=7, size=2000)
b = np.random.normal(loc=5, scale=7, size=2000)
c = np.random.normal(loc=35, scale=5, size=2500)
d = np.random.normal(loc=50, scale=8, size=3000)
e = np.random.normal(loc=70, scale=5, size=1000)
f = np.random.normal(loc=80, scale=7, size=1000)
# combine all the vectors into a single population
pop = np.concatenate((a, b, c, d, e, f))
pop.shape
(11500,)
# Reflect negative draws to positive values, then cast to whole years (int)
pop = np.abs(pop).astype(int)
# Keep only the non-zero ages (an age of 0 is excluded)
pop = pop[np.flatnonzero(pop)]
pop
array([ 8,  4,  9, ..., 75, 87, 63])
# the number of observations decreases only slightly, so it should be fine
pop.shape
(11136,)
# One-column frame holding every simulated age
df = pd.DataFrame({'Age': pop})
# The two labels to draw from
sex = ['Male', 'Female']
# random.choice() picks one random element from a list
print(random.choice(sex))
Female
# Draw one label per row in a single call. No placeholder column has to be
# created first: the number of draws comes from len(df).
df['Sex'] = random.choices(sex, k=len(df))
df
Age Sex
0 8 Female
1 4 Female
2 9 Male
3 1 Female
4 6 Male
... ... ...
11131 67 Male
11132 73 Female
11133 75 Female
11134 87 Male
11135 63 Female

11136 rows × 2 columns

df
Age Sex
0 8 Female
1 4 Female
2 9 Male
3 1 Female
4 6 Male
... ... ...
11131 67 Male
11132 73 Female
11133 75 Female
11134 87 Male
11135 63 Female

11136 rows × 2 columns

df.groupby(['Sex']).count()
Age
Sex
Female 5524
Male 5612

Explore the data with some visualizations

#Prepare the data
"""
I want to make a pyramid population catagorized by gender aging interval. First step is that 
I might  need  to put each person into different bins depending on their age
"""
'\nI want to make a pyramid population catagorized by gender aging interval. First step is that \nI might  need  to put each person into different bins depending on their age\n'
# create the Age_Interval column with 5 bins
# Each entry is (lower bound, upper bound, inclusive side(s), label).
# The first bin is closed on both sides; the others are closed on the right
# only, so a boundary age such as 20 or 40 lands in exactly one bin.
age_bins = (
    (1, 20, 'both', '1-20'),
    (20, 40, 'right', '21-40'),
    (40, 60, 'right', '41-60'),
    (60, 80, 'right', '61-80'),
    (80, 100, 'right', '81-100'),
)
for lower, upper, closed, label in age_bins:
    in_bin = df['Age'].between(lower, upper, closed)
    df.loc[in_bin, 'Age_Interval'] = label

#Credit: https://medium.com/towards-data-science/how-to-bin-numerical-data-with-pandas-fe5146c9dc55
df
Age Sex Age_Interval
0 8 Female 1-20
1 4 Female 1-20
2 9 Male 1-20
3 1 Female 1-20
4 6 Male 1-20
... ... ... ...
11131 67 Male 61-80
11132 73 Female 61-80
11133 75 Female 61-80
11134 87 Male 81-100
11135 63 Female 61-80

11136 rows × 3 columns

# Count the people in every (age interval, sex) pair and name the count column
df1 = (
    df.groupby(['Age_Interval', 'Sex'])[['Age']]
    .count()
    .reset_index()
    .rename(columns={'Age': 'Number_of_Pop'})
)
# Note that the first column repeats its values: this is long format, so it is converted to wide format next.
# For analysis we mostly want wide format, but plotting with R or some analytical tools might require long format.
df1
Age_Interval Sex Number_of_Pop
0 1-20 Female 1812
1 1-20 Male 1775
2 21-40 Female 1295
3 21-40 Male 1379
4 41-60 Female 1314
5 41-60 Male 1334
6 61-80 Female 890
7 61-80 Male 886
8 81-100 Female 213
9 81-100 Male 238
# pivot converts the long table to wide: one row per age interval, one column per sex
df2 = df1.pivot(index='Age_Interval', columns='Sex', values='Number_of_Pop')

#Credit: https://towardsdatascience.com/reshaping-a-pandas-dataframe-long-to-wide-and-vice-versa-517c7f0995ad
df2
Sex Female Male
Age_Interval
1-20 1812 1775
21-40 1295 1379
41-60 1314 1334
61-80 890 886
81-100 213 238
df2['Female'].dtype
dtype('int64')
women_bins = [i*-1 for i in df2['Female']]
len(women_bins)
5


# Bar lengths as arrays: the women's counts were negated earlier, so their
# bars extend to the left of zero while the men's extend to the right
women_bins = np.asarray(women_bins)
men_bins = df2['Male'].to_numpy()

# Turn Age_Interval back into a regular column and use its labels on the y axis
df3 = df2.reset_index()
y = df3['Age_Interval'].tolist()

# Tick positions are symmetric around zero; the labels show magnitudes only,
# so the left-hand (negative) side also reads as positive counts
tick_positions = [-2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000]
layout = go.Layout(
    yaxis=dict(title='Age'),
    xaxis=dict(
        range=[-2200, 2200],
        tickvals=tick_positions,
        ticktext=[abs(tick) for tick in tick_positions],
        title='Number_of_Population',
    ),
    barmode='overlay',
    bargap=0.1,
)

data = [
    go.Bar(
        y=y,
        x=men_bins,
        orientation='h',
        name='Men',
        hoverinfo='x',
        marker={'color': 'powderblue'},
    ),
    go.Bar(
        y=y,
        x=women_bins,
        orientation='h',
        name='Women',
        # hover text shows the positive count instead of the negated bar length
        text=-women_bins.astype('int'),
        hoverinfo='text',
        marker={'color': 'seagreen'},
    ),
]

py.iplot({'data': data, 'layout': layout}, filename='EXAMPLES/bar_pyramid')

The pyramid graph above gives an overall sense of the population grouped by age interval and sex. We can see that the largest group of the population is in the 1-20 years old range.

Measures of Central Tendency and Dispersion of Data

Now I want to explore the central tendency of the population's age, so I go back to working with the data as it was before it was categorized into bins. To gain a better understanding, I'll show some manual calculations of the descriptive statistics before using functions to get the same results.

pop # age of individuals, population
array([ 8,  4,  9, ..., 75, 87, 63])
pop.shape # number of populaiton
(11136,)
# Arithmetic mean by hand: the total of all ages divided by the head count,
# rounded to a whole number of years
mean_pop = round(pop.sum() / pop.size)
mean_pop
# The average age of the population is 37 years
37
np.median(pop)
# Median of pop is 37
37.0
# defining a function to calculate mode. It
# takes list variable as argument
def mode(lst):
    """Return every value that occurs most often in *lst*.

    Args:
        lst: Any iterable of hashable values (a list, a NumPy array, ...).

    Returns:
        A list of the value(s) with the highest frequency, in the order in
        which they first appear in *lst*. Several values are returned when
        they tie for the highest frequency. An empty input yields an empty
        list instead of raising.
    """
    # Local import so the function works without touching the import cell.
    from collections import Counter

    # Map each distinct value to the number of times it occurs.
    freq = Counter(lst)
    if not freq:
        # max() below would fail on an empty sequence.
        return []

    # Highest frequency observed.
    hf = max(freq.values())

    # Keep every value that reaches that frequency.
    return [value for value, count in freq.items() if count == hf]
 
# calling mode() function and passing list
# as argument
print(mode(pop))

#Credit: https://www.geeksforgeeks.org/how-to-calculate-the-mode-of-numpy-array/
[1]
pop
array([ 8,  4,  9, ..., 75, 87, 63])
# Observe the dispersion of the data through the deviations:
# subtract the mean from each individual age
dev = np.array([age - mean_pop for age in pop])
# deviations from the mean
dev
array([-29, -33, -28, ...,  38,  50,  26])
# Deviations taken from the exact mean average to zero; mean_pop was rounded to
# a whole number, so a small non-zero value is expected here
np.mean(dev) # the mean of the deviations should be close to zero
-0.20510057471264367
#absolute deviation of mean
dev = abs(dev)
dev
array([29, 33, 28, ..., 38, 50, 26])
#Mean Absolute deviation or 'MAD'
np.mean(dev)
print("Mean Absolute deviation is % s "
                % (np.mean(dev)))
Mean Absolute deviation is 20.97162356321839 
import statistics
pop_list = pop.tolist()
print("Standard Deviation of sample is % s "
                % (statistics.stdev(pop_list)))
Standard Deviation of sample is 25.072255798045703 

Mean absolute deviation (MAD) is a measure of the average absolute distance between each data value and the mean of a data set. Similar to standard deviation, MAD is a parameter or statistic that measures the spread, or variation, in your data.

Both MAD and SD measure the spread of the data, but SD is usually larger than MAD because SD is more sensitive to values that are farther away from the mean. For more detail on MAD and SD, see: https://articles.outlier.org/mean-absolute-deviation-meaning

Describing Dispersion

#Using describe to see the basic descriptive measurements
df['Age'].describe()
count    11136.000000
mean        36.794899
std         25.072256
min          1.000000
25%         11.000000
50%         37.000000
75%         55.000000
max        100.000000
Name: Age, dtype: float64
#Range
# Stored as `age_range` so that the built-in range() is not shadowed for the
# rest of the session.
age_range = df['Age'].max() - df['Age'].min()
print('Range is %s'
      %age_range)
Range is 99
# Interquartile range = Q3 - Q1 (75th percentile minus 25th percentile of Age)
q1 = np.percentile(df['Age'], 25)
q3 = np.percentile(df['Age'], 75)
iqr = q3 - q1
print(iqr)
44.0

Detecting Outlier

Here, an outlier is defined as any data point that lies below the lower limit (Q1 - 1.5 × IQR) or above the upper limit (Q3 + 1.5 × IQR).

# Find the lower and upper limits: 1.5 * IQR below Q1 and 1.5 * IQR above Q3
lower_limit = q1 - (1.5*iqr)
upper_limit = q3 + (1.5*iqr)
print(lower_limit, upper_limit)

# We can conclude that anyone in our population older than the upper limit (121 in the recorded output) would be considered an outlier
-55.0 121.0
# Draw a sample of 2500 ages with random.sample and look at its summary statistics
age_list = df['Age'].tolist()
sample = pd.DataFrame(random.sample(age_list, 2500))
sample.describe()
0
count 2500.000000
mean 36.736800
std 24.606489
min 1.000000
25% 11.000000
50% 37.000000
75% 54.000000
max 94.000000
df['Age'].describe()
count    11136.000000
mean        36.794899
std         25.072256
min          1.000000
25%         11.000000
50%         37.000000
75%         55.000000
max        100.000000
Name: Age, dtype: float64
# the statistical measurements of the sample set and the population are quite similar,
# so the sample set represents the population well
df
Age Sex Age_Interval
0 8 Female 1-20
1 4 Female 1-20
2 9 Male 1-20
3 1 Female 1-20
4 6 Male 1-20
... ... ... ...
11131 67 Male 61-80
11132 73 Female 61-80
11133 75 Female 61-80
11134 87 Male 81-100
11135 63 Female 61-80

11136 rows × 3 columns

# visualizing the population with a box plot, categorized by sex

df_box = df
fig = px.box(df_box, x="Sex", y="Age")
fig.show()

# from our artificial data here, I'll try adding a 'Country' attribute and making a more dynamic visualization
df
Age Sex Age_Interval
0 8 Female 1-20
1 4 Female 1-20
2 9 Male 1-20
3 1 Female 1-20
4 6 Male 1-20
... ... ... ...
11131 67 Male 61-80
11132 73 Female 61-80
11133 75 Female 61-80
11134 87 Male 81-100
11135 63 Female 61-80

11136 rows × 3 columns

country = ['Thailand', 'Taiwan', 'Japan', 'Germany']
# Draw one country per row in a single call. No placeholder column has to be
# created first: the number of draws comes from len(df).
df['Country'] = random.choices(country, k=len(df))
df
Age Sex Age_Interval Country
0 8 Female 1-20 Germany
1 4 Female 1-20 Japan
2 9 Male 1-20 Japan
3 1 Female 1-20 Germany
4 6 Male 1-20 Japan
... ... ... ... ...
11131 67 Male 61-80 Taiwan
11132 73 Female 61-80 Japan
11133 75 Female 61-80 Japan
11134 87 Male 81-100 Thailand
11135 63 Female 61-80 Thailand

11136 rows × 4 columns

from dash import Dash, dcc, html, Input, Output
from jupyter_dash import JupyterDash

app = JupyterDash(__name__)


# Page layout: a heading, a checklist for the x axis, a radio group for the
# y axis, and the graph. The "value" of each control is passed to the callback
# registered on the "graph" component.
app.layout = html.Div([
    html.H4("Analysis of Age Distribution in Population"),
    html.P("x-axis:"),
    dcc.Checklist(
        id='x-axis', 
        options=['Country', 'Sex'], 
        inline=True
    ),
    html.P("y-axis:"),
    dcc.RadioItems(
        id='y-axis', 
        # Without options no radio button is rendered under the "y-axis:"
        # label; 'Age' is the only numeric column to plot.
        options=['Age'],
        value='Age', 
        inline=True
    ),
    dcc.Graph(id="graph"),
])


@app.callback(
    Output("graph", "figure"), 
    Input("x-axis", "value"), 
    Input("y-axis", "value"))
def generate_chart(x, y):
    """Build the box plot for the "graph" component.

    Registered as the callback whose inputs are the "value" properties of
    the "x-axis" and "y-axis" components and whose output is the "figure"
    property of "graph".

    Args:
        x: Current value of the "x-axis" component, passed to px.box as x.
        y: Current value of the "y-axis" component, passed to px.box as y.

    Returns:
        The figure produced by px.box from the module-level DataFrame `df`.
    """
    # The figure is built directly from df; the unused alias was dropped.
    return px.box(df, x=x, y=y)


# Start the app only when this module is executed as the main module.
# NOTE(review): mode="inline" presumably renders the app inside the notebook
# output - confirm against the jupyter_dash documentation.
if __name__ == '__main__':
    app.run_server(mode="inline")